# NOTE: the former `rm(list = ls())` call was removed. It only deletes
# user-created objects from the global environment; attached packages,
# options and the working directory are left as they were, so it does not
# produce a genuinely fresh session, and it silently wipes the workspace of
# anyone who sources this file. Restart R when a clean state is needed.

# Load datasets (paths are resolved against the current working directory)
train1 <- read.csv("train_dataset01.csv")
train2 <- read.csv("train_dataset02.csv")
test <- read.csv("test_dataset.csv")

# Give the pump-status columns the label set c("False", "True").
# Presumably this keeps factor levels consistent between the data used for
# fitting and the data used for prediction -- TODO confirm.
# NOTE(review): `levels<-` relabels existing levels by position rather than
# matching on value, so this assumes the first existing level of every column
# listed here is "False" -- confirm against the CSV files.
for (status_col in c("STATUS_PU3", "STATUS_PU5", "STATUS_PU8", "STATUS_PU9")) {
  levels(train2[[status_col]]) <- c("False", "True")
}
for (status_col in c("STATUS_PU8", "STATUS_PU9")) {
  levels(test[[status_col]]) <- c("False", "True")
}
Split
# Partition train2 into a fitting set and a hold-out set.
library(caTools)

# Fixed seed so the random partition is the same on every run
set.seed(100)

# Logical vector, one entry per row of train2; TRUE rows go to the fitting set
spl <- sample.split(train2$ATT_FLAG, SplitRatio = 0.7)

# which() keeps only positions that are TRUE, mirroring how subset() discards
# rows whose condition is NA
attackTrain <- train2[which(spl), ]
attackTest <- train2[which(!spl), ]
Train and validate (Random Forest)
# Random Forest
library(randomForest)

# Build the model
# DATETIME is removed first so that the `.` in the formula does not pull it in
# as a predictor.
attackTrain[["DATETIME"]] <- NULL

# ATT_FLAG is the response; every remaining column of attackTrain is a predictor
model1 <- randomForest(ATT_FLAG ~ ., data = attackTrain)

# Lists the components stored in the fitted object
summary(model1)
Length Class Mode
call 3 -none- call
type 1 -none- character
predicted 12686 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 25372 matrix numeric
oob.times 12686 -none- numeric
classes 2 -none- character
importance 43 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 12686 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
terms 3 terms call
# Variable-importance plot for the model fitted on the training partition
varImpPlot(model1)

# Prediction on the hold-out partition
predict1 <- predict(model1, newdata = attackTest)

# Confusion matrix: rows are predicted labels, columns are the labels stored
# in attackTest
cm <- table(predict1, attackTest[["ATT_FLAG"]])
print(cm)
predict1 False True
False 4844 40
True 3 550
# cm has predictions in rows and hold-out labels in columns, so cell [2, 2]
# counts rows where both the prediction and the label are the second level.
# Row 2 total    = everything predicted as the second level (precision base).
# Column 2 total = everything labelled as the second level  (recall base).
precision <- cm[2, 2] / sum(cm[2, ])
recall <- cm[2, 2] / sum(cm[, 2])

# F1 is the harmonic mean of precision and recall
f1 <- (2 * precision * recall) / (precision + recall)
precision
[1] 0.994575
recall
[1] 0.9322034
f1
[1] 0.9623797
Train on whole train2 and predict test
# Build the model on the full train2 data (no hold-out this time)
# DATETIME is removed first so that the `.` in the formula does not pull it in
# as a predictor.
train2[["DATETIME"]] <- NULL

# ATT_FLAG is the response; every remaining column of train2 is a predictor
model2 <- randomForest(ATT_FLAG ~ ., data = train2)

# Lists the components stored in the fitted object
summary(model2)
Length Class Mode
call 3 -none- call
type 1 -none- character
predicted 18123 factor numeric
err.rate 1500 -none- numeric
confusion 6 -none- numeric
votes 36246 matrix numeric
oob.times 18123 -none- numeric
classes 2 -none- character
importance 43 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 14 -none- list
y 18123 factor numeric
test 0 -none- NULL
inbag 0 -none- NULL
terms 3 terms call
# Variable-importance plot for the model fitted on all of train2
varImpPlot(model2)

# Prediction: one predicted ATT_FLAG value per row of the test data
predict2 <- predict(model2, newdata = test)
See performance
# Attach the predicted attack flag to the test data and plot every remaining
# signal with the predicted flag drawn on top of it.
test$ATT_FLAG <- predict2
test.ts <- ts(test)

# Columns left out of the plots
ignore <- c(
  "LEVEL_T5", "FLOW_PU3", "FLOW_PU5", "FLOW_PU9",
  "STATUS_PU3", "STATUS_PU5", "STATUS_PU8", "STATUS_PU9"
)

# Use a logical mask rather than `-which(...)`: when no name matches,
# `-which(...)` is `integer(0)` and would select zero columns instead of
# keeping all of them.
test.small <- test[, !(names(test) %in% ignore), drop = FALSE]
test.small.ts <- ts(test.small)

for (col in colnames(test.small.ts)) {
  # DATETIME and the flag itself are not plotted as signals
  if (col %in% c("DATETIME", "ATT_FLAG")) {
    next
  }
  plot.ts(test.small.ts[, col], ylab = col, col = "black")
  # Draw the next plot on the same device without clearing it, so the
  # predicted flag overlays the signal (axes suppressed for the overlay)
  par(new = TRUE)
  plot.ts(
    test.small.ts[, "ATT_FLAG"],
    axes = FALSE, bty = "n", xlab = "", ylab = "", col = "red"
  )
}